In [22]:
# Select the `notebook` matplotlib backend through the IPython magic.
%matplotlib notebook
# Alternative `inline` backend, left disabled.
#%matplotlib inline
In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from src.data.load_data import (load_iris, load_wine, 
                                load_diabetes, load_glass,
                                load_pima_diabetes)
import seaborn as sbn
from matplotlib import pyplot as plt

Iris Insight:

In [31]:
# Load the iris data through the project loader and print its column index.
iris = load_iris()
print(iris.columns)
Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class',
       'ClassIndex'],
      dtype='object')
In [30]:
# Pair plot of the selected iris columns; points are coloured by 'Class'.
sbn.pairplot(
    data=iris[[
        'SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth',
        'Class',
    ]],
    hue='Class',
)
Out[30]:
<seaborn.axisgrid.PairGrid at 0x28e5a710d68>

Iris Stats:

In [55]:
# Summary statistics for iris, extended with one extra row labelled 'nunique'
# that holds the number of distinct values of every column.
# DataFrame.append no longer exists in pandas >= 2.0, so the row is attached
# with pd.concat. sort=False keeps describe()'s column order and places the
# columns describe() skipped (they only appear in the nunique row) at the end.
iris_stats = pd.concat(
    [
        iris.describe(),
        iris.nunique().rename('nunique').astype(int).to_frame().T,
    ],
    sort=False,
)
iris_stats
Out[55]:
SepalLength SepalWidth PetalLength PetalWidth ClassIndex Class
count 150.000000 150.000000 150.000000 150.000000 150.000000 NaN
mean 5.843333 3.054000 3.758667 1.198667 1.000000 NaN
std 0.828066 0.433594 1.764420 0.763161 0.819232 NaN
min 4.300000 2.000000 1.000000 0.100000 0.000000 NaN
25% 5.100000 2.800000 1.600000 0.300000 0.000000 NaN
50% 5.800000 3.000000 4.350000 1.300000 1.000000 NaN
75% 6.400000 3.300000 5.100000 1.800000 2.000000 NaN
max 7.900000 4.400000 6.900000 2.500000 2.000000 NaN
nunique 35.000000 23.000000 43.000000 22.000000 3.000000 3.0

Total Number of Unique Values (summed over all columns)

In [91]:
# Add up the 'nunique' row of iris_stats across all of its columns.
sum(iris_stats.loc['nunique'])
Out[91]:
129.0

Attributes Histograms & Distributions

In [81]:
# One histogram per listed iris column on a 2x2 grid. Each histogram uses as
# many bins as the column has distinct values ('nunique' row of iris_stats).
fig, ax = plt.subplots(2, 2, figsize=(18, 10))
col = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

# ax.ravel() walks the grid row by row, matching the order of `col`.
for axis, name in zip(ax.ravel(), col):
    axis.hist(iris[name], bins=int(iris_stats.loc['nunique', name]))
    axis.set_xlabel(name)

plt.show()
In [83]:
# Distribution plot (50 bins) for each of the four listed iris columns,
# laid out row by row on a 2x2 grid.
# NOTE(review): seaborn has deprecated distplot (since 0.11); histplot/displot
# are the documented replacements -- confirm the installed version before
# switching.
fig, ax = plt.subplots(2, 2, figsize=(18, 10))

for axis, name in zip(
    ax.ravel(),
    ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
):
    sbn.distplot(iris[name], bins=50, ax=axis)

plt.show()

Wine Insight:

In [84]:
# Load the wine data through the project loader and print its column index.
wine = load_wine()
print(wine.columns)
Index(['Class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
       'TotalPhenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
       'ColorIntensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline',
       'ClassIndex'],
      dtype='object')
In [86]:
# Pair plot of the selected wine columns; points are coloured by 'Class'.
sbn.pairplot(
    data=wine[[
        'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh', 'Magnesium',
        'TotalPhenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
        'ColorIntensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline',
        'Class',
    ]],
    hue='Class',
)
Out[86]:
<seaborn.axisgrid.PairGrid at 0x28e5a6700b8>

Wine Stats:

In [87]:
# Summary statistics for wine, extended with one extra row labelled 'nunique'
# that holds the number of distinct values of every column.
# DataFrame.append no longer exists in pandas >= 2.0, so the row is attached
# with pd.concat. sort=False keeps describe()'s column order instead of
# sorting the column labels.
wine_stats = pd.concat(
    [
        wine.describe(),
        wine.nunique().rename('nunique').astype(int).to_frame().T,
    ],
    sort=False,
)
wine_stats
Out[87]:
Class Alcohol MalicAcid Ash AlcalinityOfAsh Magnesium TotalPhenols Flavanoids NonflavanoidPhenols Proanthocyanins ColorIntensity Hue OD280/OD315 of diluted wines Proline ClassIndex
count 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000
mean 1.938202 13.000618 2.336348 2.366517 19.494944 99.741573 2.295112 2.029270 0.361854 1.590899 5.058090 0.957449 2.611685 746.893258 0.938202
std 0.775035 0.811827 1.117146 0.274344 3.339564 14.282484 0.625851 0.998859 0.124453 0.572359 2.318286 0.228572 0.709990 314.907474 0.775035
min 1.000000 11.030000 0.740000 1.360000 10.600000 70.000000 0.980000 0.340000 0.130000 0.410000 1.280000 0.480000 1.270000 278.000000 0.000000
25% 1.000000 12.362500 1.602500 2.210000 17.200000 88.000000 1.742500 1.205000 0.270000 1.250000 3.220000 0.782500 1.937500 500.500000 0.000000
50% 2.000000 13.050000 1.865000 2.360000 19.500000 98.000000 2.355000 2.135000 0.340000 1.555000 4.690000 0.965000 2.780000 673.500000 1.000000
75% 3.000000 13.677500 3.082500 2.557500 21.500000 107.000000 2.800000 2.875000 0.437500 1.950000 6.200000 1.120000 3.170000 985.000000 2.000000
max 3.000000 14.830000 5.800000 3.230000 30.000000 162.000000 3.880000 5.080000 0.660000 3.580000 13.000000 1.710000 4.000000 1680.000000 2.000000
nunique 3.000000 126.000000 133.000000 79.000000 63.000000 53.000000 97.000000 132.000000 39.000000 101.000000 132.000000 78.000000 122.000000 121.000000 3.000000

Total Number of Unique Values (summed over all columns)

In [90]:
# Add up the 'nunique' row of wine_stats across all of its columns.
sum(wine_stats.loc['nunique'])
Out[90]:
1282.0

Attributes Histograms & Distributions

In [99]:
# One histogram per wine column, stacked vertically. Each histogram uses as
# many bins as the column has distinct values ('nunique' row of wine_stats).
col = wine.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    axis.hist(wine[name], bins=int(wine_stats.loc['nunique', name]))
    axis.set_xlabel(name)

plt.show()
In [97]:
# One distribution plot per wine column, stacked vertically; the bin count of
# each plot is the column's distinct-value count from wine_stats.
# NOTE(review): seaborn has deprecated distplot (since 0.11) -- consider
# histplot/displot once the installed version is confirmed.
col = wine.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    sbn.distplot(wine[name], bins=int(wine_stats.loc['nunique', name]), ax=axis)

plt.show()

Glass Insight:

In [110]:
# Load the glass data through the project loader and print its column index.
glass = load_glass()
print(glass.columns)
Index(['RefractiveIndex', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon',
       'Potassium', 'Calcium', 'Barium', 'Iron', 'Class', 'ClassIndex'],
      dtype='object')
In [111]:
# Pair plot of the selected glass columns; points are coloured by 'Class'.
sbn.pairplot(
    data=glass[[
        'RefractiveIndex', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon',
        'Potassium', 'Calcium', 'Barium', 'Iron',
        'Class',
    ]],
    hue='Class',
)
Out[111]:
<seaborn.axisgrid.PairGrid at 0x28e7692f780>

Glass Stats:

In [112]:
# Summary statistics for glass, extended with one extra row labelled 'nunique'
# that holds the number of distinct values of every column.
# DataFrame.append no longer exists in pandas >= 2.0, so the row is attached
# with pd.concat. sort=False keeps describe()'s column order instead of
# sorting the column labels.
glass_stats = pd.concat(
    [
        glass.describe(),
        glass.nunique().rename('nunique').astype(int).to_frame().T,
    ],
    sort=False,
)
glass_stats
Out[112]:
RefractiveIndex Sodium Magnesium Aluminum Silicon Potassium Calcium Barium Iron Class ClassIndex
count 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000 214.000000
mean 1.518365 13.407850 2.684533 1.444907 72.650935 0.497056 8.956963 0.175047 0.057009 2.780374 1.542056
std 0.003037 0.816604 1.442408 0.499270 0.774546 0.652192 1.423153 0.497219 0.097439 2.103739 1.707648
min 1.511150 10.730000 0.000000 0.290000 69.810000 0.000000 5.430000 0.000000 0.000000 1.000000 0.000000
25% 1.516523 12.907500 2.115000 1.190000 72.280000 0.122500 8.240000 0.000000 0.000000 1.000000 0.000000
50% 1.517680 13.300000 3.480000 1.360000 72.790000 0.555000 8.600000 0.000000 0.000000 2.000000 1.000000
75% 1.519157 13.825000 3.600000 1.630000 73.087500 0.610000 9.172500 0.000000 0.100000 3.000000 2.000000
max 1.533930 17.380000 4.490000 3.500000 75.410000 6.210000 16.190000 3.150000 0.510000 7.000000 5.000000
nunique 178.000000 142.000000 94.000000 118.000000 133.000000 65.000000 143.000000 34.000000 32.000000 6.000000 6.000000

Total Number of Unique Values (summed over all columns)

In [113]:
# Add up the 'nunique' row of glass_stats across all of its columns.
sum(glass_stats.loc['nunique'])
Out[113]:
951.0

Attributes Histograms & Distributions

In [114]:
# One histogram per glass column, stacked vertically. Each histogram uses as
# many bins as the column has distinct values ('nunique' row of glass_stats).
col = glass.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    axis.hist(glass[name], bins=int(glass_stats.loc['nunique', name]))
    axis.set_xlabel(name)

plt.show()
In [116]:
# One distribution plot per glass column, stacked vertically; the bin count of
# each plot is the column's distinct-value count from glass_stats.
# NOTE(review): seaborn has deprecated distplot (since 0.11) -- consider
# histplot/displot once the installed version is confirmed.
col = glass.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    sbn.distplot(glass[name], bins=int(glass_stats.loc['nunique', name]), ax=axis)

plt.show()

Diabetes Insight

In [3]:
# Load the diabetes records through the project loader, then print the column
# index and the dtype of every column.
diabetes = load_diabetes()

print(diabetes.columns)
print(diabetes.dtypes)
Index(['index', 'Date', 'Time', 'Code', 'Value', 'namefile', 'DateTime'], dtype='object')
index                int64
Date                object
Time                object
Code                 int64
Value              float64
namefile            object
DateTime    datetime64[ns]
dtype: object
In [4]:
# Display the frame itself; the rendering is truncated to the leading and
# trailing rows (29242 rows x 7 columns in the recorded output).
diabetes
Out[4]:
index Date Time Code Value namefile DateTime
0 0 04-21-1991 9:09 58 100.0 data-01 1991-04-21 09:09:00
1 1 04-21-1991 9:09 33 9.0 data-01 1991-04-21 09:09:00
2 2 04-21-1991 9:09 34 13.0 data-01 1991-04-21 09:09:00
3 3 04-21-1991 17:08 62 119.0 data-01 1991-04-21 17:08:00
4 4 04-21-1991 17:08 33 7.0 data-01 1991-04-21 17:08:00
5 5 04-21-1991 22:51 48 123.0 data-01 1991-04-21 22:51:00
6 6 04-22-1991 7:35 58 216.0 data-01 1991-04-22 07:35:00
7 7 04-22-1991 7:35 33 10.0 data-01 1991-04-22 07:35:00
8 8 04-22-1991 7:35 34 13.0 data-01 1991-04-22 07:35:00
9 9 04-22-1991 13:40 33 2.0 data-01 1991-04-22 13:40:00
10 10 04-22-1991 16:56 62 211.0 data-01 1991-04-22 16:56:00
11 11 04-22-1991 16:56 33 7.0 data-01 1991-04-22 16:56:00
12 12 04-23-1991 7:25 58 257.0 data-01 1991-04-23 07:25:00
13 13 04-23-1991 7:25 33 11.0 data-01 1991-04-23 07:25:00
14 14 04-23-1991 7:25 34 13.0 data-01 1991-04-23 07:25:00
15 15 04-23-1991 17:25 62 129.0 data-01 1991-04-23 17:25:00
16 16 04-23-1991 17:25 33 7.0 data-01 1991-04-23 17:25:00
17 17 04-24-1991 7:52 58 239.0 data-01 1991-04-24 07:52:00
18 18 04-24-1991 7:52 33 10.0 data-01 1991-04-24 07:52:00
19 19 04-24-1991 7:52 34 14.0 data-01 1991-04-24 07:52:00
20 20 04-24-1991 12:00 33 4.0 data-01 1991-04-24 12:00:00
21 21 04-24-1991 17:10 62 129.0 data-01 1991-04-24 17:10:00
22 22 04-24-1991 22:09 48 340.0 data-01 1991-04-24 22:09:00
23 23 04-24-1991 22:09 33 5.0 data-01 1991-04-24 22:09:00
24 24 04-25-1991 7:29 58 67.0 data-01 1991-04-25 07:29:00
25 25 04-25-1991 7:29 33 9.0 data-01 1991-04-25 07:29:00
26 26 04-25-1991 7:29 34 14.0 data-01 1991-04-25 07:29:00
27 27 04-25-1991 12:49 33 4.0 data-01 1991-04-25 12:49:00
28 28 04-25-1991 17:24 62 206.0 data-01 1991-04-25 17:24:00
29 29 04-25-1991 17:24 33 7.0 data-01 1991-04-25 17:24:00
... ... ... ... ... ... ... ...
29212 311 05-04-1989 18:00 34 6.0 data-70 1989-05-04 18:00:00
29213 312 05-04-1989 22:00 48 129.0 data-70 1989-05-04 22:00:00
29214 313 05-05-1989 08:00 58 134.0 data-70 1989-05-05 08:00:00
29215 314 05-05-1989 08:00 33 1.0 data-70 1989-05-05 08:00:00
29216 315 05-05-1989 08:00 34 7.0 data-70 1989-05-05 08:00:00
29217 316 05-05-1989 18:00 33 1.0 data-70 1989-05-05 18:00:00
29218 317 05-05-1989 18:00 34 6.0 data-70 1989-05-05 18:00:00
29219 318 05-06-1989 08:00 33 1.0 data-70 1989-05-06 08:00:00
29220 319 05-06-1989 08:00 34 7.0 data-70 1989-05-06 08:00:00
29221 320 05-06-1989 18:00 62 378.0 data-70 1989-05-06 18:00:00
29222 321 05-06-1989 18:00 33 1.5 data-70 1989-05-06 18:00:00
29223 322 05-06-1989 18:00 34 6.0 data-70 1989-05-06 18:00:00
29224 323 05-07-1989 08:00 33 1.0 data-70 1989-05-07 08:00:00
29225 324 05-07-1989 08:00 34 7.0 data-70 1989-05-07 08:00:00
29226 325 05-07-1989 12:00 60 151.0 data-70 1989-05-07 12:00:00
29227 326 05-07-1989 18:00 33 1.0 data-70 1989-05-07 18:00:00
29228 327 05-07-1989 18:00 34 6.0 data-70 1989-05-07 18:00:00
29229 328 05-07-1989 22:00 48 265.0 data-70 1989-05-07 22:00:00
29230 329 05-08-1989 08:00 58 248.0 data-70 1989-05-08 08:00:00
29231 330 05-08-1989 08:00 33 1.0 data-70 1989-05-08 08:00:00
29232 331 05-08-1989 08:00 34 7.0 data-70 1989-05-08 08:00:00
29233 332 05-08-1989 18:00 33 1.0 data-70 1989-05-08 18:00:00
29234 333 05-08-1989 18:00 34 6.0 data-70 1989-05-08 18:00:00
29235 334 05-08-1989 22:00 48 145.0 data-70 1989-05-08 22:00:00
29236 335 05-09-1989 08:00 58 259.0 data-70 1989-05-09 08:00:00
29237 336 05-09-1989 08:00 33 1.0 data-70 1989-05-09 08:00:00
29238 337 05-09-1989 08:00 34 7.0 data-70 1989-05-09 08:00:00
29239 338 05-10-1989 08:00 34 7.0 data-70 1989-05-10 08:00:00
29240 339 05-11-1989 08:00 34 7.0 data-70 1989-05-11 08:00:00
29241 340 05-12-1989 08:00 34 7.0 data-70 1989-05-12 08:00:00

29242 rows × 7 columns

In [6]:
# Pair plot over the selected diabetes columns; no hue column is applied.
# NOTE(review): 'Date' and 'Time' are object-typed and 'DateTime' is
# datetime64 (see the dtypes printed when the frame is loaded); pairplot
# presumably only draws the numeric columns -- confirm against seaborn docs.
sbn.pairplot(
    data=diabetes[['Date', 'Time', 'Code', 'Value', 'DateTime']],
)
Out[6]:
<seaborn.axisgrid.PairGrid at 0x1ffb29b7630>

Diabetes Stats:

In [5]:
# Summary statistics for the integer, float and datetime columns of diabetes,
# extended with one extra row labelled 'nunique' that holds the number of
# distinct values of every column (including those describe() did not cover).
# DataFrame.append no longer exists in pandas >= 2.0, so the row is attached
# with pd.concat. sort=False keeps describe()'s column order and places the
# remaining columns at the end.
diabetes_stats = pd.concat(
    [
        diabetes.describe(include=[np.int64, np.float64, np.datetime64]),
        diabetes.nunique().rename('nunique').astype(int).to_frame().T,
    ],
    sort=False,
)
diabetes_stats
Out[5]:
index Code Value DateTime Date Time namefile
count 29242.000000 29242.000000 29242.000000 29242 NaN NaN NaN
unique NaN NaN NaN 14739 NaN NaN NaN
top NaN NaN NaN 1989-04-07 08:00:00 NaN NaN NaN
freq NaN NaN NaN 15 NaN NaN NaN
first NaN NaN NaN 1988-03-27 08:00:00 NaN NaN NaN
last NaN NaN NaN 1991-09-23 21:10:00 NaN NaN NaN
mean 317.061521 46.495862 79.421979 NaN NaN NaN NaN
std 286.642104 13.368426 93.523168 NaN NaN NaN NaN
min 0.000000 33.000000 0.000000 NaN NaN NaN NaN
25% 105.000000 33.000000 6.000000 NaN NaN NaN NaN
50% 220.000000 48.000000 24.000000 NaN NaN NaN NaN
75% 460.000000 60.000000 142.000000 NaN NaN NaN NaN
max 1326.000000 72.000000 501.000000 NaN NaN NaN NaN
nunique 1327.000000 21.000000 447.000000 14739 1140.0 1293.0 70.0

Total Number of Unique Values (summed over all columns)

In [9]:
# Add up the 'nunique' row of diabetes_stats across all of its columns.
sum(diabetes_stats.loc['nunique'])
Out[9]:
19037.0

Attributes Histograms & Distributions

In [6]:
# Histograms of the 'Code' and 'Value' columns, stacked vertically. Each one
# uses as many bins as the column has distinct values (diabetes_stats).
col = ['Code', 'Value']
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    axis.hist(diabetes[name], bins=int(diabetes_stats.loc['nunique', name]))
    axis.set_xlabel(name)

plt.show()
In [7]:
# Distribution plots of the 'Code' and 'Value' columns, stacked vertically;
# the bin count of each plot is the column's distinct-value count.
# NOTE(review): seaborn has deprecated distplot (since 0.11) -- consider
# histplot/displot once the installed version is confirmed.
col = ['Code', 'Value']
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    sbn.distplot(diabetes[name], bins=int(diabetes_stats.loc['nunique', name]), ax=axis)

plt.show()

Pima Diabetes Insight

In [4]:
# Load the Pima diabetes data through the project loader and print its
# column index.
pima = load_pima_diabetes()
print(pima.columns)
Index(['NbPregnancies', 'PlasmaGlucoseConcentration', 'DiastolicBloodPressure',
       'TricepsSkinFoldThickness', 'TwoHourSerumInsulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Class', 'ClassIndex'],
      dtype='object')
In [5]:
# Pair plot of the selected pima columns; points are coloured by 'Class'.
sbn.pairplot(
    data=pima[[
        'NbPregnancies', 'PlasmaGlucoseConcentration', 'DiastolicBloodPressure',
        'TricepsSkinFoldThickness', 'TwoHourSerumInsulin', 'BMI',
        'DiabetesPedigreeFunction', 'Age',
        'Class',
    ]],
    hue='Class',
)
Out[5]:
<seaborn.axisgrid.PairGrid at 0x1f82654d278>

Pima Stats:

In [6]:
# Summary statistics for pima, extended with one extra row labelled 'nunique'
# that holds the number of distinct values of every column.
# DataFrame.append no longer exists in pandas >= 2.0, so the row is attached
# with pd.concat. sort=False keeps describe()'s column order instead of
# sorting the column labels.
pima_stats = pd.concat(
    [
        pima.describe(),
        pima.nunique().rename('nunique').astype(int).to_frame().T,
    ],
    sort=False,
)
pima_stats
Out[6]:
NbPregnancies PlasmaGlucoseConcentration DiastolicBloodPressure TricepsSkinFoldThickness TwoHourSerumInsulin BMI DiabetesPedigreeFunction Age Class ClassIndex
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958 0.651042
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000 1.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000 1.000000
nunique 17.000000 136.000000 47.000000 51.000000 186.000000 248.000000 517.000000 52.000000 2.000000 2.000000

Total Number of Unique Values (summed over all columns)

In [8]:
# Add up the 'nunique' row of pima_stats across all of its columns.
sum(pima_stats.loc['nunique'])
Out[8]:
1258.0

Attributes Histograms & Distributions

In [9]:
# One histogram per pima column, stacked vertically. Each histogram uses as
# many bins as the column has distinct values ('nunique' row of pima_stats).
col = pima.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    axis.hist(pima[name], bins=int(pima_stats.loc['nunique', name]))
    axis.set_xlabel(name)

plt.show()
In [10]:
# One distribution plot per pima column, stacked vertically; the bin count of
# each plot is the column's distinct-value count from pima_stats.
# NOTE(review): seaborn has deprecated distplot (since 0.11) -- consider
# histplot/displot once the installed version is confirmed.
col = pima.columns
fig, ax = plt.subplots(len(col), 1, figsize=(15, len(col)*5))

for axis, name in zip(ax, col):
    sbn.distplot(pima[name], bins=int(pima_stats.loc['nunique', name]), ax=axis)

plt.show()
In [11]:
from sklearn.manifold import TSNE
In [14]:
# Embed the eight pima feature columns (class columns excluded) into three
# dimensions with t-SNE; `g` holds one 3-component row per input row.
# t-SNE is stochastic, so random_state is pinned: without it every kernel
# restart produces a different embedding and the results cannot be reproduced.
tsne = TSNE(n_components=3, random_state=0)
g = tsne.fit_transform(
    pima[[
        'NbPregnancies', 'PlasmaGlucoseConcentration', 'DiastolicBloodPressure',
        'TricepsSkinFoldThickness', 'TwoHourSerumInsulin', 'BMI',
        'DiabetesPedigreeFunction', 'Age',
    ]].values
)
In [23]:
# 3-D scatter of the first three columns of the embedding `g`, coloured by
# the values of the pima 'Class' column.
import matplotlib.pyplot as plt
# Axes3D is not referenced by name below; presumably it is imported so that
# the '3d' projection is registered on older matplotlib releases -- confirm.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter3D(
    g[:, 0],
    g[:, 1],
    g[:, 2],
    c=pima['Class'].values,
)
Out[23]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1f82c51b390>